import thulac
import re
import jieba
# Initialize the THULAC segmenter (segmentation only, no POS tagging).
# NOTE(review): `thu` is never used below — jieba performs the actual
# segmentation in cut_words. Confirm before deleting; loading the model
# is expensive and currently wasted.
thu = thulac.thulac(seg_only=True)

# Load the Chinese stopword list, one word per line.
stopwordsPath = '../ChineseStopwords.txt'
# Use a context manager so the handle is always closed, declare the
# encoding explicitly, and store the words in a set for O(1) membership
# tests inside cut_words (the original list made every lookup O(n)).
with open(stopwordsPath, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split('\n'))
def cut_words(line):
    '''
    Segment a line of Chinese text and remove stopwords.

    :param line: raw input text (str)
    :return: space-joined segmented tokens with stopwords removed (str)
    '''
    # jieba.lcut returns the segmentation as a list of token strings.
    tokens = jieba.lcut(line)
    # Drop any token found in the module-level stopword collection.
    # (The debug print of every segmented line was removed — it flooded
    # stdout when processing a whole corpus.)
    return ' '.join(word for word in tokens if word not in stopwords)
# Path of the demo corpus to segment, one document per line.
file_path = '../data/demo.txt'

if __name__ == '__main__':  # guard so importing this module has no side effects
    with open(file_path, 'r', encoding='utf-8') as f1:
        # Iterate the file directly (no need to materialize readlines());
        # strip the trailing newline before segmenting each line.
        new_data = [cut_words(raw.strip()) for raw in f1]
